#load libraries and required ASER datasets
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(knitr)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(ggplot2)

setwd("/Users/Arunima/Desktop/D3SR")

aser_school_2011 <- read.csv("/Users/Arunima/Desktop/D3SR/ASER School Data 2007-18/ASER 2011/ASER2011-School.txt")

aser_household_2011 <- read.csv("/Users/Arunima/Desktop/D3SR/ASER Household_Village Data 2007-18/ASER 2011/ASER 2011 Household Data.txt")
#select the required variables from the ASER 2011 household dataset

aser_household_2011 <- aser_household_2011 %>%
  group_by(state_name, district_name) 

aser_hh_selected <- aser_household_2011 %>%
  select(state_code, district_code, state_name, district_name, hh_id, child_age, child_gender, hh_tv, hh_mobile, hh_toilet, hh_electricity_conn, hh_type, school_class, oos_never_enr, oos_dropout, oos_dropout_class, school_govt, school_private, school_madarsa, school_other, tuition, read_nothing, read_letter, read_word, read_level_1, read_level_2, math_nothing, math_num_1_9, math_num_10_99, math_subtraction, math_division, father_class, hh_multiplier, mother_age, mother_class, vlg_pucca_road, vlg_electricity, vlg_std_booth, vlg_bank, vlg_private_health_clinic, vlg_internet_cafe, vlg_solar_energy, vlg_private_school)
#select the required variables from the ASER 2011 school level dataset
aser_school_2011 <- aser_school_2011 %>%
  group_by(state_name, district_name) 

aser_school_selected <- aser_school_2011 %>%
  select(state_code, district_code, state_name, district_name, schtype_class, childenrollment_class_1, childenrollment_class_2, childenrollment_class_3, childenrollment_class_4, childenrollment_class_5, childenrollment_class_6, childenrollment_class_7, childenrollment_class_8, regulargovtteacher_appointment, midday_meal_in_school, boundary_wall, computer_in_school, computer_in_school_usable, girls_toilet_in_school, boys_toilet_in_school)

ASER HOUSEHOLD LEVEL DATA 2011

#Creating Indicators

##Create an indicator for Arithematic Score based on the level of learning

aser_hh_selected <- aser_hh_selected %>%
  mutate(math_score = case_when(
    math_division == 1 ~ 4,
    math_subtraction == 1 ~ 3,
    math_num_10_99 == 1 ~ 2,
    math_num_1_9 == 1 ~ 1,
    math_nothing == 1 ~ 0,
    TRUE ~ NA_real_
  )) %>% 
mutate(math_score_scaled = ifelse(!is.na(math_score), math_score / 4, NA)) # scales down the indicator to 4

## Create an indicator for Reading Scores based on the ability to read. This creates an ordinal variable

aser_hh_selected <- aser_hh_selected %>%
  mutate(reading_score = case_when(
    read_level_2 == 1 ~ 4,
    read_level_1 == 1 ~ 3,
    read_word == 1 ~ 2,
    read_letter == 1 ~ 1,
    read_nothing == 1 ~ 0,
    TRUE ~ NA_real_
  ))%>% 
mutate(reading_score_scaled = ifelse(!is.na(reading_score), reading_score / 4, NA)) %>% # scales down the indicator to 4
  mutate(composite_score= (reading_score_scaled+math_score_scaled)/2) # creating a new variable called composite score to know the education outcome

#create a variable for school enrollment
aser_hh_selected <- aser_hh_selected %>%
  mutate(enrolled_any_school = case_when(
    oos_dropout == 0 ~ 0,
    school_govt == 1 | school_private == 1 | school_madarsa == 1 | school_other == 1 ~ 1,
    TRUE ~ 0
  )) 

## creating a binary variable for dropout (Dropout = 1, not dropout = 0)

aser_hh_selected <- aser_hh_selected %>%
 mutate(dropout = if_else(is.na(oos_dropout), 1, 0))

#creating category variables for mother's level of education according to ASER's manual

aser_hh_selected$mother_education_cat <- cut(
  aser_hh_selected$mother_class,
  breaks = c(-1, 0, 5, 8, 10, 12, 15, 17, 18),
  labels = c("No formal education", 
             "Primary (1–5)", 
             "Upper Primary (6–8)", 
             "Secondary (9–10)", 
             "Higher Secondary (11–12)", 
             "Bachelors (13–15)", 
             "Postgraduate (16–17)", 
             "Diploma (18)"),
  right = TRUE
)

library(knitr) #loading knitr to use kable function

# View the category distribution in table format
kable(table(aser_hh_selected$mother_education_cat)) 
Var1 Freq
No formal education 296
Primary (1–5) 91820
Upper Primary (6–8) 77547
Secondary (9–10) 66336
Higher Secondary (11–12) 22377
Bachelors (13–15) 7913
Postgraduate (16–17) 1620
Diploma (18) 533
## Creating Household Characteristics Index to control for any confounding variables in the environment

aser_hh_selected <- aser_hh_selected %>%
  ungroup() %>% 
  mutate(
    hh_tv = as.numeric(ifelse(hh_tv == 1, 1,
                              ifelse(hh_tv == 2, 0, NA))),
    hh_mobile = as.numeric(ifelse(hh_mobile == 1, 1,
                                  ifelse(hh_mobile == 2, 0, NA))),
    hh_toilet = as.numeric(ifelse(hh_toilet == 1, 1,
                                  ifelse(hh_toilet == 2, 0, NA))),
    hh_electricity_conn = as.numeric(ifelse(hh_electricity_conn == 1, 1,
                                            ifelse(hh_electricity_conn == 2, 0, NA))),
    
# Scaling hh_type from 1–3 to 0–1 and creating a column for the same
    hh_type_scaled = ifelse(hh_type %in% 1:3, (hh_type - 1) / 2, NA)
  ) %>%
  mutate(
    household_characteristics_index = rowMeans(select(., 
      hh_tv,
      hh_mobile,
      hh_electricity_conn,
      hh_type_scaled
    ), na.rm = TRUE)
  )

# creating a new column avg_household_characteristics_index to calculate household characteristics at district level

household_characteristics_index <- aser_hh_selected %>%
  mutate(district_name = tolower(trimws(district_name))) %>% 
  group_by(district_name) %>%
  summarise(
    avg_household_characteristics_index = mean(household_characteristics_index, na.rm = TRUE),
    .groups = 'drop'
  )
#Aggregating data to the district level using household multiplier which indicates the number of households the sampled household represents in the population of the district and creating a new dataframe

average_HH_district_summary <- aser_hh_selected %>%
  mutate(district_name = tolower(trimws(district_name))) %>% 
  group_by(district_name) %>%
  summarise(
    avg_reading = weighted.mean(reading_score_scaled, weights = hh_multiplier, na.rm = TRUE),
    avg_arithmetic = weighted.mean(math_score_scaled, weights = hh_multiplier, na.rm = TRUE),
    avg_child_edu = weighted.mean(composite_score, weights = hh_multiplier, na.rm = TRUE),
    avg_enrollment = weighted.mean(enrolled_any_school, weights = hh_multiplier, na.rm = TRUE),
    avg_paternal_edu = weighted.mean(father_class, weights = hh_multiplier, na.rm = TRUE),
    avg_maternal_edu = weighted.mean(mother_class, weights = hh_multiplier, na.rm = TRUE),
    avg_enrollment_ratio = weighted.mean(enrolled_any_school, weights = hh_multiplier, na.rm = TRUE),
    avg_dropout = weighted.mean(dropout, weights = hh_multiplier, na.rm = TRUE),
    avg_mother_class = weighted.mean(mother_class, weights = hh_multiplier, na.rm = TRUE))

# merging the district characteristics and household characteristics data frames by district name to make a new data frame called household_dist_summary

 household_dist_summary<- left_join(average_HH_district_summary,household_characteristics_index, by = "district_name")

##ASER SCHOOL LEVEL DATA 2011

#aggregating child primary school enrolment across classes 1 to 8 to know total enrollment per school
aser_school_selected <- aser_school_selected %>%
  ungroup() %>% 
  mutate(childenrolment = rowSums(select(., childenrollment_class_1:childenrollment_class_8), na.rm = TRUE))

#creating a new variable to measure teacher student ratio based on the number of appointed teachers and child enrollment in a school

aser_school_selected <- aser_school_selected %>% 
  mutate(teacher_student_ratio = regulargovtteacher_appointment/childenrolment)

#creating School Quality Index to control for the confounders due school infrastructure

aser_school_selected <- aser_school_selected %>%
  mutate(
    midday_meal_in_school = as.numeric(ifelse(midday_meal_in_school == 1, 1,
                                      ifelse(midday_meal_in_school == 2, 0, NA))),
    boundary_wall = as.numeric(ifelse(boundary_wall == 1, 1,
                              ifelse(boundary_wall == 2, 0, NA))),
    computer_in_school = as.numeric(ifelse(computer_in_school == 1, 1,
                                  ifelse(computer_in_school == 2, 0, NA))),
    computer_in_school_usable = as.numeric(ifelse(computer_in_school_usable == 1, 1,
                                          ifelse(computer_in_school_usable == 2, 0, NA))),
    girls_toilet_in_school = as.numeric(ifelse(girls_toilet_in_school == 1, 1,
                                        ifelse(girls_toilet_in_school == 2, 0, NA))),
    boys_toilet_in_school = as.numeric(ifelse(boys_toilet_in_school == 1, 1,
                                       ifelse(boys_toilet_in_school == 2, 0, NA)))
  ) %>%
  mutate(
    school_quality_index = rowMeans(select(.,  #creating a new column for school quality index
      midday_meal_in_school,
      boundary_wall,
      computer_in_school,
      computer_in_school_usable,
      girls_toilet_in_school,
      boys_toilet_in_school
    ), na.rm = TRUE)
  )

#creating a new dataframe to average school quality characteristics at district level

school_quality_index <- aser_school_selected %>%
  mutate(district_name = tolower(trimws(district_name))) %>% 
  group_by(district_name) %>%
  summarise(
    avg_school_quality_index = mean(school_quality_index, na.rm = TRUE),
    childenrolment = mean(childenrolment, na.rm = TRUE),
    .groups = 'drop'
  ) 

#creating a new data frame for teacher student ratio
TSR_DF <- aser_school_selected %>%
    mutate(district_name = tolower(trimws(district_name))) %>% 
  group_by(district_name) %>%  # or whatever your district ID column is
  summarise(
    total_students = sum(childenrolment, na.rm = TRUE),
    total_teachers = sum(regulargovtteacher_appointment , na.rm = TRUE),  # assuming 'teachers' is your column
    district_tsr = total_students / total_teachers,
    .groups = 'drop'
  )

#merging all the school level data frames by district name to consolidate into one dataframe
merged_school <- left_join(school_quality_index,TSR_DF, by = "district_name")

#merging both household level and school level data frames to create one data frame for ASER data
merged_aser <- left_join(household_dist_summary,merged_school, by = "district_name")

Population Census 2011 (SHRUG)

#Loading PC11 Data
pca11 <- read.csv("/Users/Arunima/Desktop/D3SR/shrug-pca11-csv/pc11_pca_clean_pc11dist.csv")

#adding state names
state_ids <- read.csv("/Users/Arunima/Downloads/state_key.csv")
pca11 <- merge(pca11, state_ids, by = "pc11_state_id", all = TRUE)

#adding district names
district_ids <- read.csv("/Users/Arunima/Downloads/dis_key.csv")
pca11 <- merge(pca11, district_ids, by = "pc11_district_id", all = TRUE)

#selecting required variables and making two new variables
pca11 <- pca11 %>%
  select(pc11_state_id, pc11_district_id, pc11_pca_tot_p, pc11_pca_p_sc, pc11_pca_p_st, pc11_pca_tot_f, pc11_pca_f_lit, pc11_pca_tot_work_p, pc11_pca_tot_work_f, pc11_pca_non_work_f, pc11_pca_non_work_p, pc11_pca_no_hh, district_name, state_name) %>% 
  mutate(female_lfp = ((1 - (pc11_pca_non_work_f / pc11_pca_tot_f))*100) )%>% #creating a new variable for female labour force participation
  mutate(SC_share = (pc11_pca_p_sc / pc11_pca_tot_p)* 100) %>% #creating a new variable to know the proportion of SC population as a proportion of the total population 
  mutate(ST_share = (pc11_pca_p_st / pc11_pca_tot_p)* 100) #creating a new variable to know the proportion of ST population as a proportion of the total population 

Merging Datasets

#merging the Population Census Abstract 2011 data frame with the ASER data frame
merged <- left_join(pca11, merged_aser, by = "district_name")

# Loading the Night line data for urban controls
night_light_dist <- read.csv("/Users/Arunima/Desktop/D3SR/shrug-viirs-annual-csv/viirs_annual_pc11dist.csv")%>%
  group_by(pc11_district_id) %>%
  summarise(viirs_annual_mean = mean(viirs_annual_mean, na.rm = TRUE), .groups = 'drop') #summarising the data at district level

#merging the above data frame with night time light data
merged2 <- left_join(merged, night_light_dist, by = "pc11_district_id",relationship="many-to-many")

# saving the cv in desktop for backup
write.csv(merged2, "/Users/Arunima/Desktop/D3SR/merged2.csv", row.names = FALSE)

Final Cleaning

# Defining a vector of variable names that we want to work with

vars_used <- c("avg_arithmetic", "avg_maternal_edu", "female_lfp",
                "viirs_annual_mean", "avg_paternal_edu", "SC_share", "ST_share",
                "district_tsr", "avg_school_quality_index", "avg_household_characteristics_index")
 
# Creating a new data frame that selects only the variables in vars_used then filters rows where any of the numeric variables have NA, NaN, or infinite values
  
 merged2_check <- merged2 %>%
   select(all_of(vars_used)) %>%
   filter_if(is.numeric, any_vars(is.na(.) | is.nan(.) | is.infinite(.)))
 
 # creates a data frame without NA values
merged2_clean <- merged2 %>%
  filter(
    if_all(any_of(vars_used), ~ !is.na(.) & !is.nan(.) & is.finite(.))
  )

District Characteristics index

# Cleaning and transforming village-level indicators into binary variables 

merged3_clean <- aser_hh_selected %>%
  mutate(
    vlg_pucca_road = ifelse(vlg_pucca_road == 1, 1,
                            ifelse(vlg_pucca_road == 2, 0, NA)),
    vlg_electricity = ifelse(vlg_electricity == 1, 1,
                             ifelse(vlg_electricity == 2, 0, NA)),
    vlg_std_booth = ifelse(vlg_std_booth == 1, 1,
                           ifelse(vlg_std_booth == 2, 0, NA)),
    vlg_bank = ifelse(vlg_bank == 1, 1,
                      ifelse(vlg_bank == 2, 0, NA)),
    vlg_private_health_clinic = ifelse(vlg_private_health_clinic == 1, 1,
                                       ifelse(vlg_private_health_clinic == 2, 0, NA)),
    vlg_internet_cafe = ifelse(vlg_internet_cafe == 1, 1,
                               ifelse(vlg_internet_cafe == 2, 0, NA)),
    vlg_solar_energy = ifelse(vlg_solar_energy == 1, 1,
                              ifelse(vlg_solar_energy == 2, 0, NA)),
    vlg_private_school = ifelse(vlg_private_school == 1, 1,
                                ifelse(vlg_private_school == 2, 0, NA))
  ) %>%
# Creating a new variable district_development_index by averaging the binary indicators
  mutate(
    district_development_index = rowMeans(select(., 
      vlg_pucca_road,
      vlg_electricity,
      vlg_std_booth,
      vlg_bank,
      vlg_private_health_clinic,
      vlg_internet_cafe,
      vlg_solar_energy,
      vlg_private_school
    ), na.rm = TRUE)
  )

## aggregating to district level by calculating the average index value per district as a new variable

district_characteristics_index <- merged3_clean %>%
  mutate(district_name = tolower(trimws(district_name))) %>% 
  group_by(district_name) %>%
  summarise(
    avg_district_characteristics_index = mean(district_development_index, na.rm = TRUE),
    .groups = 'drop'
  )

# merging the district-level development index into the cleaned merged2 data frame by district name

merged2_clean <- merged2_clean %>% 
  left_join(district_characteristics_index, by = "district_name" )

REGRESSIONS

#running the preliminary regression with the dependent variable (avg_child_edu), independent variable (avg_maternal_edu) and all the controld

model_11 <- lm(
  avg_child_edu ~ avg_maternal_edu + female_lfp + 
    viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
    district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index,
  data = merged2_clean
)

summary(model_11)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index + 
##     avg_district_characteristics_index, data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.213758 -0.041057  0.001879  0.041641  0.180013 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          2.735e-01  3.585e-02   7.627 1.60e-13 ***
## avg_maternal_edu                     3.154e-02  4.929e-03   6.398 4.17e-10 ***
## female_lfp                           1.249e-03  3.508e-04   3.561 0.000412 ***
## viirs_annual_mean                   -5.212e-03  2.555e-03  -2.040 0.041924 *  
## avg_paternal_edu                    -6.773e-03  5.124e-03  -1.322 0.186899    
## SC_share                            -3.457e-04  4.604e-04  -0.751 0.453101    
## ST_share                            -3.742e-05  1.943e-04  -0.193 0.847361    
## district_tsr                        -1.461e-05  3.807e-05  -0.384 0.701312    
## avg_school_quality_index             3.483e-02  2.565e-02   1.358 0.175247    
## avg_household_characteristics_index  2.504e-01  2.714e-02   9.226  < 2e-16 ***
## avg_district_characteristics_index  -3.115e-02  3.472e-02  -0.897 0.370120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared:  0.5363, Adjusted R-squared:  0.5254 
## F-statistic: 49.04 on 10 and 424 DF,  p-value: < 2.2e-16

Regressions with Controls

# Running a linear regression model (Model 1) with average maternal education  as the only predictor of average child education
model_1 <- lm(
  avg_child_edu ~ avg_maternal_edu,
  data = merged2_clean
)

summary(model_1)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu, data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.252287 -0.053027  0.002396  0.056917  0.201797 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.288183   0.026625   10.82   <2e-16 ***
## avg_maternal_edu 0.047136   0.003546   13.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07976 on 433 degrees of freedom
## Multiple R-squared:  0.2898, Adjusted R-squared:  0.2882 
## F-statistic: 176.7 on 1 and 433 DF,  p-value: < 2.2e-16
#Running Model 2 by adding female literacy rate and female labor force participation as controls to the basic model
model_2 <- lm(
  avg_child_edu ~ avg_maternal_edu + female_lfp,
  data = merged2_clean
)

summary(model_2)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp, data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.219101 -0.053890  0.001482  0.051807  0.196542 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.2084250  0.0285878   7.291 1.48e-12 ***
## avg_maternal_edu 0.0502205  0.0034373  14.610  < 2e-16 ***
## female_lfp       0.0019861  0.0003198   6.210 1.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07651 on 432 degrees of freedom
## Multiple R-squared:  0.348,  Adjusted R-squared:  0.345 
## F-statistic: 115.3 on 2 and 432 DF,  p-value: < 2.2e-16
# Running Model 3 by including more covariates: night light intensity, paternal education, SC/ST share, teacher-student ratio, school quality, and household characteristics

model_3 <- lm(
  avg_child_edu ~ avg_maternal_edu + female_lfp + 
    viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
    district_tsr + avg_school_quality_index + avg_household_characteristics_index,
  data = merged2_clean
)

summary(model_3)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index, 
##     data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.215157 -0.042040  0.002236  0.042159  0.179477 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          2.751e-01  3.580e-02   7.686 1.06e-13 ***
## avg_maternal_edu                     3.074e-02  4.848e-03   6.342 5.82e-10 ***
## female_lfp                           1.273e-03  3.496e-04   3.642 0.000304 ***
## viirs_annual_mean                   -5.024e-03  2.545e-03  -1.974 0.049033 *  
## avg_paternal_edu                    -6.881e-03  5.121e-03  -1.344 0.179801    
## SC_share                            -2.781e-04  4.541e-04  -0.612 0.540618    
## ST_share                            -1.423e-05  1.925e-04  -0.074 0.941114    
## district_tsr                        -1.721e-05  3.795e-05  -0.453 0.650452    
## avg_school_quality_index             2.580e-02  2.359e-02   1.094 0.274690    
## avg_household_characteristics_index  2.421e-01  2.551e-02   9.490  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06511 on 425 degrees of freedom
## Multiple R-squared:  0.5354, Adjusted R-squared:  0.5256 
## F-statistic: 54.42 on 9 and 425 DF,  p-value: < 2.2e-16
# Running Model 4 by further adding the district-level development index to the previous full model
model_4 <- lm(
  avg_child_edu ~ avg_maternal_edu + female_lfp + 
    viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
    district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index,
  data = merged2_clean
)

summary(model_4)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index + 
##     avg_district_characteristics_index, data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.213758 -0.041057  0.001879  0.041641  0.180013 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          2.735e-01  3.585e-02   7.627 1.60e-13 ***
## avg_maternal_edu                     3.154e-02  4.929e-03   6.398 4.17e-10 ***
## female_lfp                           1.249e-03  3.508e-04   3.561 0.000412 ***
## viirs_annual_mean                   -5.212e-03  2.555e-03  -2.040 0.041924 *  
## avg_paternal_edu                    -6.773e-03  5.124e-03  -1.322 0.186899    
## SC_share                            -3.457e-04  4.604e-04  -0.751 0.453101    
## ST_share                            -3.742e-05  1.943e-04  -0.193 0.847361    
## district_tsr                        -1.461e-05  3.807e-05  -0.384 0.701312    
## avg_school_quality_index             3.483e-02  2.565e-02   1.358 0.175247    
## avg_household_characteristics_index  2.504e-01  2.714e-02   9.226  < 2e-16 ***
## avg_district_characteristics_index  -3.115e-02  3.472e-02  -0.897 0.370120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared:  0.5363, Adjusted R-squared:  0.5254 
## F-statistic: 49.04 on 10 and 424 DF,  p-value: < 2.2e-16
# Loading the texreg library and displaying all four models side-by-side. specifying to round off at 4 digits

library(texreg)
## Version:  1.39.4
## Date:     2024-07-23
## Author:   Philip Leifeld (University of Manchester)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
## 
## Attaching package: 'texreg'
## The following object is masked from 'package:tidyr':
## 
##     extract
screenreg(
  list(model_1, model_2, model_3, model_4), 
  column.labels = c("Model 1", "Model 2", "Model 3", "Model 4"),
  digits = 4
)
## 
## ===========================================================================================
##                                      Model 1       Model 2       Model 3       Model 4     
## -------------------------------------------------------------------------------------------
## (Intercept)                            0.2882 ***    0.2084 ***    0.2751 ***    0.2735 ***
##                                       (0.0266)      (0.0286)      (0.0358)      (0.0359)   
## avg_maternal_edu                       0.0471 ***    0.0502 ***    0.0307 ***    0.0315 ***
##                                       (0.0035)      (0.0034)      (0.0048)      (0.0049)   
## female_lfp                                           0.0020 ***    0.0013 ***    0.0012 ***
##                                                     (0.0003)      (0.0003)      (0.0004)   
## viirs_annual_mean                                                 -0.0050 *     -0.0052 *  
##                                                                   (0.0025)      (0.0026)   
## avg_paternal_edu                                                  -0.0069       -0.0068    
##                                                                   (0.0051)      (0.0051)   
## SC_share                                                          -0.0003       -0.0003    
##                                                                   (0.0005)      (0.0005)   
## ST_share                                                          -0.0000       -0.0000    
##                                                                   (0.0002)      (0.0002)   
## district_tsr                                                      -0.0000       -0.0000    
##                                                                   (0.0000)      (0.0000)   
## avg_school_quality_index                                           0.0258        0.0348    
##                                                                   (0.0236)      (0.0257)   
## avg_household_characteristics_index                                0.2421 ***    0.2504 ***
##                                                                   (0.0255)      (0.0271)   
## avg_district_characteristics_index                                              -0.0311    
##                                                                                 (0.0347)   
## -------------------------------------------------------------------------------------------
## R^2                                    0.2898        0.3480        0.5354        0.5363    
## Adj. R^2                               0.2882        0.3450        0.5256        0.5254    
## Num. obs.                            435           435           435           435         
## ===========================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

State Fixed Effects

# Load the package
library(lfe)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
# Run the fixed effects model
model_a <- felm(avg_child_edu ~ avg_maternal_edu + female_lfp + viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
     district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index | 
     state_name,  # Fixed effect (e.g., state)
     data = merged2_clean)

summary(model_a)
## 
## Call:
##    felm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +      viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +      district_tsr + avg_school_quality_index + avg_household_characteristics_index +      avg_district_characteristics_index | state_name, data = merged2_clean) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18293 -0.03278 -0.00180  0.03729  0.16111 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## avg_maternal_edu                     1.989e-02  6.016e-03   3.306  0.00103 ** 
## female_lfp                           1.193e-03  4.315e-04   2.765  0.00597 ** 
## viirs_annual_mean                   -4.710e-03  2.378e-03  -1.981  0.04828 *  
## avg_paternal_edu                     8.971e-03  5.772e-03   1.554  0.12090    
## SC_share                            -4.103e-04  5.445e-04  -0.754  0.45157    
## ST_share                            -3.341e-04  2.421e-04  -1.380  0.16836    
## district_tsr                         2.557e-05  4.278e-05   0.598  0.55028    
## avg_school_quality_index             1.450e-01  3.508e-02   4.134 4.35e-05 ***
## avg_household_characteristics_index  2.296e-01  3.595e-02   6.387 4.74e-10 ***
## avg_district_characteristics_index  -6.441e-02  4.079e-02  -1.579  0.11511    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05378 on 396 degrees of freedom
## Multiple R-squared(full model): 0.7047   Adjusted R-squared: 0.6763 
## Multiple R-squared(proj model): 0.335   Adjusted R-squared: 0.2712 
## F-statistic(full model):24.86 on 38 and 396 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 19.95 on 10 and 396 DF, p-value: < 2.2e-16
list(model_1, model_2, model_3, model_4)
## [[1]]
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu, data = merged2_clean)
## 
## Coefficients:
##      (Intercept)  avg_maternal_edu  
##          0.28818           0.04714  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp, data = merged2_clean)
## 
## Coefficients:
##      (Intercept)  avg_maternal_edu        female_lfp  
##         0.208425          0.050220          0.001986  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index, 
##     data = merged2_clean)
## 
## Coefficients:
##                         (Intercept)                     avg_maternal_edu  
##                           2.751e-01                            3.074e-02  
##                          female_lfp                    viirs_annual_mean  
##                           1.273e-03                           -5.024e-03  
##                    avg_paternal_edu                             SC_share  
##                          -6.881e-03                           -2.781e-04  
##                            ST_share                         district_tsr  
##                          -1.423e-05                           -1.721e-05  
##            avg_school_quality_index  avg_household_characteristics_index  
##                           2.580e-02                            2.421e-01  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index + 
##     avg_district_characteristics_index, data = merged2_clean)
## 
## Coefficients:
##                         (Intercept)                     avg_maternal_edu  
##                           2.735e-01                            3.154e-02  
##                          female_lfp                    viirs_annual_mean  
##                           1.249e-03                           -5.212e-03  
##                    avg_paternal_edu                             SC_share  
##                          -6.773e-03                           -3.457e-04  
##                            ST_share                         district_tsr  
##                          -3.742e-05                           -1.461e-05  
##            avg_school_quality_index  avg_household_characteristics_index  
##                           3.483e-02                            2.504e-01  
##  avg_district_characteristics_index  
##                          -3.115e-02
 library(texreg)

screenreg(
  list(model_a),  # Supplying a list of one or more models for side-by-side display
  type = "latex",  
  digits = 4,  # Setting the number of decimal places being shown in the output to 4
  single.row = TRUE, # Displaying coefficients and standard errors in a single row for compactness
  custom.model.names = c("Model A"),
  custom.coef.names = c( # # Renaming coefficients for clearer interpretation
    "Maternal Education", 
    "Female LFPR", 
    "Night Time Lights", 
    "Paternal Education", 
    "SCs", 
    "STs", 
    "District TSR", 
    "School Quality Index", 
    "HH Characteristics Index", 
    "District Characteristics Index"
  ),
  caption = "Regression Results"
)
## 
## =====================================================
##                                 Model A              
## -----------------------------------------------------
## Maternal Education                0.0199 (0.0060) ** 
## Female LFPR                       0.0012 (0.0004) ** 
## Night Time Lights                -0.0047 (0.0024) *  
## Paternal Education                0.0090 (0.0058)    
## SCs                              -0.0004 (0.0005)    
## STs                              -0.0003 (0.0002)    
## District TSR                      0.0000 (0.0000)    
## School Quality Index              0.1450 (0.0351) ***
## HH Characteristics Index          0.2296 (0.0359) ***
## District Characteristics Index   -0.0644 (0.0408)    
## -----------------------------------------------------
## Num. obs.                       435                  
## R^2 (full model)                  0.7047             
## R^2 (proj model)                  0.3350             
## Adj. R^2 (full model)             0.6763             
## Adj. R^2 (proj model)             0.2712             
## Num. groups: state_name          29                  
## =====================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Regional Fixed Effects

# Load necessary libraries
library(lfe)      # For felm
library(dplyr)    # For data manipulation
library(texreg)   # For regression tables

# Define state lists with names in lowercase, using spaces to match merged2_clean
north_states <- c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh")
south_states <- c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry")
east_states  <- c("bihar", "jharkhand", "odisha", "west bengal")
west_states  <- c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu")
northeast_states <- c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam")
central_states <- c("chhattisgarh", "madhya pradesh")

# Combine into a named list
indian_regions <- list(
  North = north_states,
  South = south_states,
  East = east_states,
  West = west_states,
  Northeast = northeast_states,
  Central = central_states
)

# Function to run regression for a region
run_region_model <- function(region, data, region_states) {
  # Filter data for the region
  region_data <- data %>% filter(state_name %in% region_states)
  
  # Check if there is data
  if (nrow(region_data) > 0) {
    # Run fixed effects model
    model <- felm(
      avg_child_edu ~ avg_maternal_edu + female_lfp + viirs_annual_mean + 
                      avg_paternal_edu + SC_share + ST_share + district_tsr + 
                      avg_school_quality_index + avg_household_characteristics_index + 
                      avg_district_characteristics_index | state_name,
      data = region_data
    )
    return(model)
  } else {
    warning(paste("No data for region:", region))
    return(NULL)
  }
}

# Run models for all regions using lapply
model_region_fe <- lapply(names(indian_regions), function(region) {
  run_region_model(region, merged2_clean, indian_regions[[region]])
})

# Remove NULL entries (regions with no data)
model_region_fe <- model_region_fe[!sapply(model_region_fe, is.null)]

# Name= Name models for screenreg
names(model_region_fe) <- names(indian_regions)[!sapply(model_region_fe, is.null)]

# Display results using screenreg
screenreg(
  model_region_fe,
  type = "latex",
  digits = 4,
  single.row = TRUE,
  custom.model.names = names(model_region_fe),
  custom.coef.names = c(
    "Maternal Education", 
    "Female LFPR", 
    "Night Time Lights", 
    "Paternal Education", 
    "SCs", 
    "STs", 
    "District TSR", 
    "School Quality Index", 
    "HH Characteristics Index", 
    "District Characteristics Index"
  ),
  caption = "Regional Fixed Effects Regression Results"
)
## 
## ==========================================================================================================================================================
##                                 North                  South                East                West                Northeast           Central           
## ----------------------------------------------------------------------------------------------------------------------------------------------------------
## Maternal Education                0.0253 (0.0095) **    0.0109 (0.0154)      0.0039 (0.0197)     0.0109 (0.0134)     0.0087 (0.0244)     0.0146 (0.0183)  
## Female LFPR                       0.0030 (0.0006) ***   0.0009 (0.0011)     -0.0021 (0.0014)     0.0006 (0.0013)    -0.0003 (0.0016)     0.0032 (0.0014) *
## Night Time Lights                -0.0039 (0.0032)      -0.0023 (0.0028)     -0.0189 (0.0162)    -0.0026 (0.0161)     0.0147 (0.0347)     0.0091 (0.0273)  
## Paternal Education                0.0285 (0.0108) **   -0.0125 (0.0123)      0.0321 (0.0195)     0.0114 (0.0124)     0.0304 (0.0207)    -0.0033 (0.0143)  
## SCs                               0.0004 (0.0007)      -0.0023 (0.0011) *    0.0006 (0.0012)     0.0000 (0.0018)     0.0013 (0.0045)    -0.0053 (0.0031)  
## STs                               0.0007 (0.0004)      -0.0008 (0.0013)      0.0003 (0.0009)    -0.0006 (0.0007)     0.0008 (0.0007)    -0.0023 (0.0009) *
## District TSR                     -0.0001 (0.0001)      -0.0026 (0.0009) **   0.0002 (0.0001) *  -0.0011 (0.0005) *   0.0004 (0.0002)    -0.0000 (0.0001)  
## School Quality Index              0.1028 (0.0608)       0.1793 (0.1101)      0.1365 (0.0791)     0.0467 (0.1304)     0.0905 (0.0842)     0.2144 (0.1089)  
## HH Characteristics Index          0.1742 (0.0562) **    0.5030 (0.1536) **   0.1626 (0.0973)     0.1763 (0.1098)     0.2689 (0.1108) *   0.0198 (0.1134)  
## District Characteristics Index   -0.0039 (0.0613)      -0.1932 (0.0791) *    0.2052 (0.1351)     0.0078 (0.1099)    -0.0425 (0.1166)    -0.0099 (0.1564)  
## ----------------------------------------------------------------------------------------------------------------------------------------------------------
## Num. obs.                       108                    79                   70                  76                  58                  44                
## R^2 (full model)                  0.9008                0.6540               0.6426              0.5850              0.6958              0.5605           
## R^2 (proj model)                  0.6805                0.4655               0.5152              0.4564              0.4026              0.4014           
## Adj. R^2 (full model)             0.8846                0.5783               0.5596              0.4898              0.5770              0.4095           
## Adj. R^2 (proj model)             0.6284                0.3485               0.4026              0.3317              0.1695              0.1956           
## Num. groups: state_name           6                     5                    4                   5                   7                   2                
## ==========================================================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Removing multicollinearity

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
# Combine the correlated variables into a matrix
pca_data <- merged2_clean[, c("avg_maternal_edu", "avg_paternal_edu")]

# Apply PCA
pca_result <- prcomp(pca_data, scale. = TRUE)

# Add the first principal component as a new variable
merged2_clean$pca_education <- pca_result$x[, 1]

# Now you can use "pca_education" in your regression model

model <- lm(
  avg_child_edu ~ avg_maternal_edu + female_lfp +
    viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
    district_tsr + avg_school_quality_index +
    avg_household_characteristics_index + avg_district_characteristics_index,
  data = merged2_clean
)

# Check summary and coefficients
summary(model)
## 
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + 
##     viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + 
##     district_tsr + avg_school_quality_index + avg_household_characteristics_index + 
##     avg_district_characteristics_index, data = merged2_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.213758 -0.041057  0.001879  0.041641  0.180013 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          2.735e-01  3.585e-02   7.627 1.60e-13 ***
## avg_maternal_edu                     3.154e-02  4.929e-03   6.398 4.17e-10 ***
## female_lfp                           1.249e-03  3.508e-04   3.561 0.000412 ***
## viirs_annual_mean                   -5.212e-03  2.555e-03  -2.040 0.041924 *  
## avg_paternal_edu                    -6.773e-03  5.124e-03  -1.322 0.186899    
## SC_share                            -3.457e-04  4.604e-04  -0.751 0.453101    
## ST_share                            -3.742e-05  1.943e-04  -0.193 0.847361    
## district_tsr                        -1.461e-05  3.807e-05  -0.384 0.701312    
## avg_school_quality_index             3.483e-02  2.565e-02   1.358 0.175247    
## avg_household_characteristics_index  2.504e-01  2.714e-02   9.226  < 2e-16 ***
## avg_district_characteristics_index  -3.115e-02  3.472e-02  -0.897 0.370120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared:  0.5363, Adjusted R-squared:  0.5254 
## F-statistic: 49.04 on 10 and 424 DF,  p-value: < 2.2e-16
library(texreg)

screenreg(model,
          digits = 3,
          single.row = TRUE,
          stars = c(0.01, 0.05, 0.1),
          custom.coef.names = c("Intercept",
                                "Avg. Maternal Education",
                                "Female LFP",
                                "Night Lights (VIIRS)",
                                "Avg. Paternal Education",
                                "SC Share",
                                "ST Share",
                                "TSR Score",
                                "School Quality Index",
                                "Household Characteristics Index",
                                "District Characteristics Index"))
## 
## ====================================================
##                                  Model 1            
## ----------------------------------------------------
## Intercept                          0.273 (0.036) ***
## Avg. Maternal Education            0.032 (0.005) ***
## Female LFP                         0.001 (0.000) ***
## Night Lights (VIIRS)              -0.005 (0.003) ** 
## Avg. Paternal Education           -0.007 (0.005)    
## SC Share                          -0.000 (0.000)    
## ST Share                          -0.000 (0.000)    
## TSR Score                         -0.000 (0.000)    
## School Quality Index               0.035 (0.026)    
## Household Characteristics Index    0.250 (0.027) ***
## District Characteristics Index    -0.031 (0.035)    
## ----------------------------------------------------
## R^2                                0.536            
## Adj. R^2                           0.525            
## Num. obs.                        435                
## ====================================================
## *** p < 0.01; ** p < 0.05; * p < 0.1

Visualisation

merged2_clean$predicted_values <- predict(model_4)

# Calculate residuals (observed - predicted)
merged2_clean$residuals <- merged2_clean$avg_arithmetic - merged2_clean$predicted_values

# Create a plot
plot(merged2_clean$predicted_values, merged2_clean$residuals, 
     main="Residuals vs Predicted", 
     xlab="Predicted Values", ylab="Residuals")

# Add a horizontal line at 0 for better visualization
abline(h = 0, col = "red")

# Highlight outliers (e.g., define an outlier if residuals are > 3 standard deviations)
outliers <- merged2_clean[abs(merged2_clean$residuals) > 3*sd(merged2_clean$residuals), ]

# Add labels for outliers
text(outliers$predicted_values, outliers$residuals, 
     labels = rownames(outliers), pos = 4, col = "blue")

# Correlational Matrix

library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Selecting a subset of variables from merged2_clean
new_Df <- merged2_clean %>% 
  select(female_lfp:avg_district_characteristics_index)

# Converting all columns to numeric where possible
 numeric_data <- data.frame(lapply(new_Df, function(x) {
   if (is.numeric(x)) return(x)
   suppressWarnings(as.numeric(as.character(x)))
 }))
 
# Identifying columns that have zero variance
zero_var_cols <- sapply(numeric_data, function(x) sd(x, na.rm = TRUE) == 0)
names(zero_var_cols[zero_var_cols == TRUE])  # Show column names with zero variance
## character(0)
# Removing columns that became entirely NA after conversion
numeric_data <- numeric_data[, colSums(!is.na(numeric_data)) > 0]

# computing correlation
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Melting the correlation matrix for ggplot2 compatibility
melted_cor <- melt(cor_matrix)

# Creating a heatmap with colored tiles and correlation values as text

heatmap <- ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), size = 4, color = "black") +
  scale_fill_gradient2(low = "darkblue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 10, hjust = 1)) +
  coord_fixed()

#displaying the map
heatmap

# Saving the heatmap as a PNG file
ggsave("plot.png", plot = heatmap, width = 8, height = 6)

Map Plotting

# Read in India shapefile (adjust path or use geojson)
dist_shp <- st_read("/Users/Arunima/Desktop/D3SR/shrug-pc11dist-poly-shp/district.shp")  # or use a geojson
## Reading layer `district' from data source 
##   `/Users/Arunima/Desktop/D3SR/shrug-pc11dist-poly-shp/district.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 641 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 68.13327 ymin: 6.754878 xmax: 97.40441 ymax: 37.07827
## Geodetic CRS:  WGS 84
#renmaing colum to match for merging
dist_shp <- dist_shp %>%
  rename(pc11_district_id = pc11_d_id)

dist_shp <- dist_shp %>%
  mutate(pc11_district_id = as.integer(pc11_district_id))

merged2_clean <- merged2_clean %>%
  mutate(pc11_district_id = as.integer(pc11_district_id))

# Joining map with data
dist_shp <- dist_shp %>%
  left_join(merged2_clean, by = "pc11_district_id")

#plotting the map for avergae child education over India
ggplot(dist_shp) +
  geom_sf(aes(fill = avg_child_edu), color = NA) +
  scale_fill_viridis_c(name = "Avg. Child Outcome", na.value = "black") +
  theme_minimal() +
  labs(
    title = "Average Children's Education Level by District",
    caption = "Data Source: ASER + 2011 Census"
  )

#Box Plots

library(ggplot2)
library(dplyr)

# Bin avg_mother_edu into categories
merged2_clean <- merged2_clean %>%
  mutate(mother_edu_bin = cut(avg_maternal_edu,
                              breaks = quantile(avg_maternal_edu, probs = seq(0, 1, 0.2), na.rm = TRUE),
                              include.lowest = TRUE,
                              labels = c("Very Low", "Low", "Medium", "High", "Very High")))

# Plot the boxplot
ggplot(merged2_clean, aes(x = mother_edu_bin, y = avg_child_edu)) +
  geom_boxplot(fill = "skyblue") +
  labs(
    title = "Boxplot of Average Child Education by Mother's Education Level",
    x = "Mother's Education (Binned)",
    y = "Average Years of Child Education"
  ) +
  theme_minimal()

# Bin female_lfp
merged2_clean <- merged2_clean %>%
  mutate(female_lfp_bin = cut(female_lfp,
                              breaks = quantile(female_lfp, probs = seq(0, 1, 0.2), na.rm = TRUE),
                              include.lowest = TRUE,
                              labels = c("Very Low", "Low", "Medium", "High", "Very High")))

# Plot
ggplot(merged2_clean, aes(x = female_lfp_bin, y = avg_child_edu)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "Boxplot of Average Child Education by Female Labour Force Participation",
    x = "Female Labour Force Participation (Binned)",
    y = "Average Years of Child Education"
  ) +
  theme_minimal()

# Boxplot of avg_child_edu by District
ggplot(merged2_clean, aes(x = "", y = avg_child_edu)) +
  geom_boxplot(fill = "cyan", width = 0.2) +  # reduce width here
  labs(title = "Distribution of Average Child Education Across Districts",
       y = "Average Years of Child Education",
       x = "") +
  theme_minimal()

# Normalize the district index to range from 0 to 1
merged2_clean$district_index <- scales::rescale(as.numeric(factor(merged2_clean$district_name)))

# Identify extremes
extremes <- merged2_clean %>%
  filter(avg_child_edu == min(avg_child_edu, na.rm = TRUE) |
         avg_child_edu == max(avg_child_edu, na.rm = TRUE))

# Remove extremes from merged2_clean for plotting
merged2_clean_no_extremes <- merged2_clean %>%
  filter(!(avg_child_edu %in% extremes$avg_child_edu))

# Scatterplot (excluding extremes)
ggplot(merged2_clean_no_extremes, aes(x = avg_maternal_edu, y = avg_child_edu)) +
  geom_point(alpha = 0.6, color = "black") +
  geom_point(data = extremes, aes(x = district_index, y = avg_child_edu), color = "red", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "solid") +  # Adding the trend line
  labs(title = "Scatterplot of Average Child Education Across Districts",
       x = "District Index (Normalized)",
       y = "Average Years of Child Education") +
  scale_x_continuous(limits = c(4.5, NA)) +  # Set x-axis to start at 4.5
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Load required packages
library(ggplot2)
library(dplyr)
library(sf)         # For maps
library(ggthemes)
library(RColorBrewer)
library(cowplot)    # For combining plots
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(viridis)    # Color palettes
## Loading required package: viridisLite
library(ggpubr)     # For regression lines
## Registered S3 method overwritten by 'broom':
##   method    from
##   nobs.felm lfe
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
library(tidyr)


child_edu_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_child_edu)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(title = "Mother's Education vs Child's Composite Test Score",
       x = "Mother's Years of Schooling",
       y = "Child Test Score") +
  theme_minimal()

child_edu_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).

read_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_reading)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(title = "Mother's Education vs Child's Reading Test Score",
       x = "Mother's Years of Schooling",
       y = "Child Test Score") +
  theme_minimal()

read_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).

math_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_arithmetic)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(title = "Mother's Education vs Child's Arithmetic Test Score",
       x = "Mother's Years of Schooling",
       y = "Child Test Score") +
  theme_minimal()
math_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Ensure the district ID in merged2 is a character with leading zeros to match the shapefile
merged2 <- merged2 %>%
  mutate(pc11_district_id = sprintf("%03d", as.integer(pc11_district_id)))


#mapping mothers level of education


summary(merged2_clean)
##  pc11_state_id   pc11_district_id pc11_pca_tot_p     pc11_pca_p_sc    
##  Min.   : 1.00   Min.   :  1.0    Min.   :    8004   Min.   :      0  
##  1st Qu.: 9.00   1st Qu.:164.0    1st Qu.: 1092055   1st Qu.: 118404  
##  Median :18.00   Median :318.0    Median : 1847023   Median : 292144  
##  Mean   :17.33   Mean   :324.8    Mean   : 2161090   Mean   : 361354  
##  3rd Qu.:24.50   3rd Qu.:494.0    3rd Qu.: 2985834   3rd Qu.: 508104  
##  Max.   :34.00   Max.   :637.0    Max.   :11060148   Max.   :2168304  
##  pc11_pca_p_st     pc11_pca_tot_f    pc11_pca_f_lit    pc11_pca_tot_work_p
##  Min.   :      0   Min.   :   3590   Min.   :   1822   Min.   :   3555    
##  1st Qu.:   8845   1st Qu.: 529640   1st Qu.: 269162   1st Qu.: 462379    
##  Median :  67180   Median : 893164   Median : 491775   Median : 751930    
##  Mean   : 175871   Mean   :1051181   Mean   : 584051   Mean   : 861377    
##  3rd Qu.: 231888   3rd Qu.:1427611   3rd Qu.: 792352   3rd Qu.:1149398    
##  Max.   :1564369   Max.   :5195070   Max.   :3635765   Max.   :4492767    
##  pc11_pca_tot_work_f pc11_pca_non_work_f pc11_pca_non_work_p pc11_pca_no_hh   
##  Min.   :   1327     Min.   :   2263     Min.   :   4449     Min.   :   1952  
##  1st Qu.: 122845     1st Qu.: 372393     1st Qu.: 643450     1st Qu.: 227110  
##  Median : 234266     Median : 658186     Median :1097721     Median : 375873  
##  Mean   : 272066     Mean   : 779115     Mean   :1299713     Mean   : 445447  
##  3rd Qu.: 361767     3rd Qu.:1060028     3rd Qu.:1751370     3rd Qu.: 602288  
##  Max.   :1239177     Max.   :4262784     Max.   :6567381     Max.   :2529165  
##  district_name       state_name          female_lfp        SC_share     
##  Length:435         Length:435         Min.   : 7.631   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.:18.682   1st Qu.: 8.492  
##  Mode  :character   Mode  :character   Median :28.637   Median :16.409  
##                                        Mean   :28.616   Mean   :15.197  
##                                        3rd Qu.:37.806   3rd Qu.:20.865  
##                                        Max.   :64.039   Max.   :50.170  
##     ST_share        avg_reading     avg_arithmetic   avg_child_edu   
##  Min.   : 0.0000   Min.   :0.2754   Min.   :0.2440   Min.   :0.2597  
##  1st Qu.: 0.4449   1st Qu.:0.6005   1st Qu.:0.5386   1st Qu.:0.5722  
##  Median : 3.7993   Median :0.6732   Median :0.6137   Median :0.6476  
##  Mean   :15.8327   Mean   :0.6664   Mean   :0.6090   Mean   :0.6385  
##  3rd Qu.:17.1247   3rd Qu.:0.7374   3rd Qu.:0.6838   3rd Qu.:0.7101  
##  Max.   :97.8188   Max.   :0.8731   Max.   :0.8481   Max.   :0.8596  
##  avg_enrollment   avg_paternal_edu avg_maternal_edu avg_enrollment_ratio
##  Min.   :0.5971   Min.   : 5.473   Min.   : 4.645   Min.   :0.5971      
##  1st Qu.:0.8002   1st Qu.: 7.949   1st Qu.: 6.717   1st Qu.:0.8002      
##  Median :0.8514   Median : 8.556   Median : 7.351   Median :0.8514      
##  Mean   :0.8428   Mean   : 8.561   Mean   : 7.431   Mean   :0.8428      
##  3rd Qu.:0.8932   3rd Qu.: 9.250   3rd Qu.: 7.997   3rd Qu.:0.8932      
##  Max.   :0.9784   Max.   :11.199   Max.   :11.225   Max.   :0.9784      
##   avg_dropout     avg_mother_class avg_household_characteristics_index
##  Min.   :0.8755   Min.   : 4.645   Min.   :0.1649                     
##  1st Qu.:0.9645   1st Qu.: 6.717   1st Qu.:0.4775                     
##  Median :0.9815   Median : 7.351   Median :0.6373                     
##  Mean   :0.9746   Mean   : 7.431   Mean   :0.6304                     
##  3rd Qu.:0.9906   3rd Qu.: 7.997   3rd Qu.:0.8046                     
##  Max.   :1.0000   Max.   :11.225   Max.   :0.9555                     
##  avg_school_quality_index childenrolment   total_students  total_teachers  
##  Min.   :0.06667          Min.   : 16.76   Min.   :  159   Min.   :  4.00  
##  1st Qu.:0.47933          1st Qu.: 93.85   1st Qu.: 2504   1st Qu.: 40.00  
##  Median :0.60705          Median :138.37   Median : 3787   Median : 69.00  
##  Mean   :0.58886          Mean   :165.01   Mean   : 4653   Mean   : 83.34  
##  3rd Qu.:0.70829          3rd Qu.:190.07   3rd Qu.: 5483   3rd Qu.:110.00  
##  Max.   :0.96190          Max.   :654.37   Max.   :18955   Max.   :358.00  
##   district_tsr     viirs_annual_mean   avg_district_characteristics_index
##  Min.   :   8.48   Min.   : 0.000602   Min.   :0.1417                    
##  1st Qu.:  32.55   1st Qu.: 0.353918   1st Qu.:0.3498                    
##  Median :  46.74   Median : 0.604526   Median :0.4184                    
##  Mean   :  85.77   Mean   : 0.904440   Mean   :0.4386                    
##  3rd Qu.:  90.73   3rd Qu.: 1.055772   3rd Qu.:0.5060                    
##  Max.   :1053.50   Max.   :19.192452   Max.   :0.9376                    
##  pca_education       predicted_values   residuals          mother_edu_bin
##  Min.   :-4.042045   Min.   :0.4735   Min.   :-0.22947   Very Low :87    
##  1st Qu.:-0.899107   1st Qu.:0.5806   1st Qu.:-0.07554   Low      :87    
##  Median :-0.002101   Median :0.6373   Median :-0.02902   Medium   :87    
##  Mean   : 0.000000   Mean   :0.6385   Mean   :-0.02947   High     :87    
##  3rd Qu.: 0.963014   3rd Qu.:0.6932   3rd Qu.: 0.01371   Very High:87    
##  Max.   : 3.713781   Max.   :0.8129   Max.   : 0.18805                   
##    female_lfp_bin district_index  
##  Very Low :87     Min.   :0.0000  
##  Low      :87     1st Qu.:0.2477  
##  Medium   :87     Median :0.5000  
##  High     :87     Mean   :0.4991  
##  Very High:87     3rd Qu.:0.7477  
##                   Max.   :1.0000
# Select and reshape
plot_data <- dist_shp %>%
  select(geometry, avg_arithmetic, avg_household_characteristics_index, avg_child_edu) %>%
  pivot_longer(cols = c(avg_arithmetic, avg_household_characteristics_index),
               names_to = "variable", values_to = "value")

# Plot
ggplot(plot_data) +
  geom_sf(aes(fill = value), color = "black", size = 0.1) +
  scale_fill_viridis_c(na.value = "gray") +
  facet_wrap(~ variable) +
  theme_minimal() +
  labs(title = "District-Level Comparison", fill = "Value")

### Overlay map
ggplot() +
  # First layer: average arithmetic
  geom_sf(data = dist_shp, aes(fill = avg_child_edu), color = NA, alpha = 0.6) +
  
  # Second layer: average reading

  geom_sf(data = dist_shp, aes(fill = avg_maternal_edu), color = NA, alpha = 0.6) +
  
  # Third layer: household_characteristics_index
  geom_sf(data = dist_shp, aes(fill = avg_household_characteristics_index), color = NA, alpha = 0.5) +
  
  scale_fill_viridis_c(na.value = "gray") +
  theme_minimal() +
  labs(title = "Overlay: Child Education, Maternal Education & Household Index") +
  theme(legend.position = "none")

library(patchwork)  # For side-by-side plots
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots
p1 <- ggplot(dist_shp) +
  geom_sf(aes(fill = avg_maternal_edu), color = "black", size = 0.1) +
  scale_fill_viridis_c(option = "viridis", na.value = "gray") +
  theme_minimal() +
  labs(title = "District-wise Average Years of Maternal Education", fill = "Score")

p2 <- ggplot(dist_shp) +
  geom_sf(aes(fill = female_lfp), color = "black", size = 0.1) +
  scale_fill_viridis_c(option = "viridis", na.value = "gray") +
  theme_minimal() +
  labs(title = "District-wise Female Labor Force Participation Rate", fill = "Score")

# Display side by side
p1 + p2

#map to plot average child education with hiusehold characteristics
ggplot() +
  geom_sf(data = dist_shp, aes(fill = avg_child_edu), color = "grey90") +
  geom_sf(data = dist_shp %>% filter(avg_household_characteristics_index > 0.5),
          fill = NA, color = "black", size = 0.4) +
  scale_fill_viridis_c(na.value = "gray") +
  theme_minimal() +
  labs(title = "Average Child Education with HH Index")

Child Education by Region and Urban/Rural Classification

merged2_clean <- merged2_clean %>%
  mutate(zone = case_when(
    state_name %in% c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh") ~ "North",
    state_name %in% c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry") ~ "South",
    state_name %in% c("bihar", "jharkhand", "odisha", "west bengal") ~ "East",
    state_name %in% c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu") ~ "West",
    state_name %in% c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam") ~ "Northeast",
    state_name %in% c("chhattisgarh", "madhya pradesh") ~ "Central",
    TRUE ~ NA_character_
  ))

viirs_cutoff <- quantile(merged2_clean$viirs_annual_mean, 0.95, na.rm = TRUE)

# Step 2: Add urban_dummy to merged2_clean itself
merged2_clean <- merged2_clean %>%
  mutate(urban_dummy = ifelse(viirs_annual_mean >= viirs_cutoff, 1, 0)) %>% 
  mutate(urban_label = ifelse(urban_dummy == 1, "Urban", "Rural"))

merged2_clean <- merged2_clean %>%
  mutate(
    urban_label = ifelse(urban_dummy == 1, "Urban", "Rural")
  )

merged2_clean <- merged2_clean %>%
  filter(!is.na(zone))

ggplot(merged2_clean, aes(x = zone, y = avg_child_edu, fill = urban_label)) +
  geom_boxplot(position = position_dodge(0.8), width = 0.6) +
  labs(
    title = "Child Education by Region and Urban/Rural Classification",
    x = "Region",
    y = "Average Arithmetic Score",
    fill = "Location Type"
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Child Education by Region and Urban/Rural Classification

merged2_clean <- merged2_clean %>%
  mutate(zone = case_when(
    state_name %in% c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh") ~ "North",
    state_name %in% c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry") ~ "South",
    state_name %in% c("bihar", "jharkhand", "odisha", "west bengal") ~ "East",
    state_name %in% c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu") ~ "West",
    state_name %in% c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam") ~ "Northeast",
    state_name %in% c("chhattisgarh", "madhya pradesh") ~ "Central",
    TRUE ~ NA_character_
  ))


library(ggplot2)

ggplot(merged2_clean, aes(x = avg_maternal_edu, y = avg_child_edu, color = zone)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_brewer(palette = "Set1", na.value = "grey") +
  labs(
    title = "Average Child Education vs. Average Maternal Education by Zone",
    x = "Average Maternal Education (Years)",
    y = "Average Child Education (Score)",
    color = "Zone"
  ) +
  theme_minimal()

The End

Thank you!